knitr::opts_chunk$set(echo = TRUE)
# setwd("G:/My Drive/CAPSTONE/working")
library(dplyr)
library(tidyverse)
library(tidyr)
library(data.table)
library(ggcorrplot)
library(corrplot)
library(RColorBrewer)
library(devtools)
library(readr)
library(plotly)
library(ggplot2)
library(zoo)
library(rstatix)
library(scales)   # to access breaks/formatting functions
library(gridExtra) # for arranging plots
library(Hmisc)
library(randomForest)
library(mlbench)
library(e1071)
library(ggfortify)
library(pastecs)
test_df <- read.csv("test_df.csv", header = T, stringsAsFactors = F)
submission = read.csv("submission_format.csv", header = T, stringsAsFactors = F)

Load splitted test & training sets. Load Random Forest & SVM data. Impute test_df NAs with previous non-NA value using na.locf()

load("dengue_test_train_split.RData")
load("dengue_random_forest.RData")
load("dengue_SVM.RData")
test_df <- select(test_df, -c(total_cases))
for (i in c(5:24)) {test_df[,i] <- na.locf(test_df[,i])}
test_iq <- test_df %>% filter(city == "iq")
test_sj <- test_df %>% filter(city == "sj")

Random Forest Validation

Validation on lm_test_df from basic Random Forest created on train_df

rf_df_test = predict(rf_df_regressor, newdata=lm_test_df)
resid_df_test = rf_df_test - lm_test_df$total_cases

#Calculate MSE on lm_test_df dataset
mse_df <- print(mean(resid_df_test^2))
## [1] 666.8971

Validation on lm_test_df for model based on the number of trees generating the least MSE

rf_df_test2 = predict(rf_df_regressor2, newdata=lm_test_df)
resid_df_test2 = rf_df_test2 - lm_test_df$total_cases

#Calculate MSE on lm_test_df dataset
mse2_df <- print(mean(resid_df_test2^2))
## [1] 620.1547

Validation on lm_test_df based on the number of trees generating the least MSE & top 6 variables based on Node Purity

rf_df_test3 = predict(rf_df_regressor3, newdata=lm_test_df)
resid_df_test3 = rf_df_test3 - lm_test_df$total_cases
mse3_df <- print(mean(resid_df_test3^2))
## [1] 538.5882

Similarly, we implement the validation on the datsets split by city.

San Juan validation

1st regressor

rf_sj_test = predict(rf_sj_regressor, newdata=lm_test_sj)
resid_sj_test = rf_sj_test - lm_test_sj$total_cases
mse_sj <- print(mean(resid_sj_test^2))
## [1] 753.0785

2nd regressor

rf_sj_test2 = predict(rf_sj_regressor2, newdata=lm_test_sj)
resid_sj_test2 = rf_sj_test2 - lm_test_sj$total_cases
mse2_sj <- print(mean(resid_sj_test2^2))
## [1] 737.931

3rd regressor

rf_sj_test3 = predict(rf_sj_regressor3, newdata=lm_test_sj)
resid_sj_test3 = rf_sj_test3 - lm_test_sj$total_cases
mse3_sj <- print(mean(resid_sj_test3^2))
## [1] 790.4614

Iquitos validation

1st regressor

rf_iq_test = predict(rf_iq_regressor, newdata=lm_test_iq)
resid_iq_test = rf_iq_test - lm_test_iq$total_cases
mse_iq <- print(mean(resid_iq_test^2))
## [1] 52.23506

2nd regressor

rf_iq_test2 = predict(rf_iq_regressor2, newdata=lm_test_iq)
resid_iq_test2 = rf_iq_test2 - lm_test_iq$total_cases
mse2_iq <- print(mean(resid_iq_test2^2))
## [1] 52.31801

3rd regressor

rf_iq_test3 = predict(rf_iq_regressor3, newdata=lm_test_iq)
resid_iq_test3 = rf_iq_test3 - lm_test_iq$total_cases
mse3_iq <- print(mean(resid_iq_test3^2))
## [1] 60.39508

SVM Validation

train_df error

# As we have selected best model, we are predicting the outcomes.
# tunedModel_df_test <- predict(tunedModel_df,lm_test_df) 
error1 <- tunedModel_df_test - lm_test_df$total_cases 
# MSE
mse_svm_df <- print(mean(error1^2))
## [1] 800.2505

train_sj error

# tunedModel_sj <- tuneResult1$best.model
# tunedModel_sj_test <- predict(tunedModel_sj,lm_test_sj) 
error2 <- tunedModel_sj_test - lm_test_sj$total_cases 
mse_svm_sj <- print(mean(error2^2))
## [1] 856.6288

train_iq error

# tunedModel_iq <- tuneResult2$best.model
# tunedModel_iq_test <- predict(tunedModel_iq,lm_test_iq) 
error3 <- tunedModel_iq_test - lm_test_iq$total_cases 
mse_svm_iq <- print(mean(error3^2))
## [1] 61.12607

Create a dataframe for including the model name & its respective MSE score. Display the models by ascending MSE score

test_df_val <- data.frame(c('rf_df_regressor', 'rf_df_regressor2', 'rf_df_regressor3', 'rf_iq_regressor', 'rf_iq_regressor2', 'rf_iq_regressor3', 'rf_sj_regressor', 'rf_sj_regressor2', 'rf_sj_regressor3', 'tunedModel_df', 'tunedModel_iq', 'tunedModel_sj'), c(mse_df, mse2_df, mse3_df, mse_iq, mse2_iq, mse3_iq, mse_sj, mse2_sj, mse3_sj, mse_svm_df, mse_svm_iq, mse_svm_sj))
names(test_df_val) <- c('Model', 'MSE')
test_df_val[order(test_df_val$MSE),]
##               Model       MSE
## 4   rf_iq_regressor  52.23506
## 5  rf_iq_regressor2  52.31801
## 6  rf_iq_regressor3  60.39508
## 11    tunedModel_iq  61.12607
## 3  rf_df_regressor3 538.58819
## 2  rf_df_regressor2 620.15472
## 1   rf_df_regressor 666.89707
## 8  rf_sj_regressor2 737.93098
## 7   rf_sj_regressor 753.07851
## 9  rf_sj_regressor3 790.46135
## 10    tunedModel_df 800.25055
## 12    tunedModel_sj 856.62881

rf_iq_regressor (Random Forest Regressor 1 for Iquitos) is the best model for predicting in the Iquitos test set. rf_sj_regressor2 (Random Forest Regressor 2 for San Juan) is the best model for predicting in the San Juan test set. rf_df_regressor3 (Random Forest Regressor 3 for Entire set) is the best model for predicting the entire test set.

We pick the best model for the entire Test data (no city subsets) generated by both Random forest and SVM models.

We have rf_df_regressor3 model as the best prediction on the entire Test Set because it has produced the lease MSE

rf_df_testfinal = predict(rf_df_regressor3, newdata=test_df)

test_df_rf = cbind(test_df,rf_df_testfinal)
test_df_rf$rf_df_testfinal <- ceiling(test_df_rf$rf_df_testfinal)
test_df_rf <- test_df_rf %>% 
  rename(
    total_cases = rf_df_testfinal
    )
head(test_df_rf[,c(1:4,28)], n = 3)
##   city year weekofyear week_start_date total_cases
## 1   sj 2008         18      2008-04-29          11
## 2   sj 2008         19      2008-05-06          12
## 3   sj 2008         20      2008-05-13          21

SVM prediction on entire test set

tunedModel_df_testfinal <- predict(tunedModel_df,newdata=test_df)
test_df_svm= cbind(test_df,tunedModel_df_testfinal)
test_df_svm$tunedModel_df_testfinal <- ceiling(test_df_svm$tunedModel_df_testfinal)
test_df_svm <- test_df_svm %>% 
  rename(
    total_cases = tunedModel_df_testfinal
    )
head(test_df_svm[,c(1:4,28)], n = 3)
##   city year weekofyear week_start_date total_cases
## 1   sj 2008         18      2008-04-29          33
## 2   sj 2008         19      2008-05-06          10
## 3   sj 2008         20      2008-05-13          -1

Best model for San Juan derived from Random Forest: rf_sj_regressor2

tunedModel_sj_test <- predict(rf_sj_regressor2, newdata = test_sj) 

test_sj_rf = cbind(test_sj,tunedModel_sj_test)
test_sj_rf$total_cases<- tunedModel_sj_test
test_sj_rf <- select(test_sj_rf, -c(tunedModel_sj_test, month, day, day_of_year))
test_sj_rf$total_cases <- ceiling(test_sj_rf$total_cases)
head(test_sj_rf[,c(1:4,25)], n = 3)
##   city year weekofyear week_start_date total_cases
## 1   sj 2008         18      2008-04-29          14
## 2   sj 2008         19      2008-05-06          17
## 3   sj 2008         20      2008-05-13          25

Best model for Iquitos derived from Random Forest: rf_iq_regressor

tunedModel_iq_test <- predict(rf_iq_regressor,test_iq) 

test_iq_rf=cbind(test_iq,tunedModel_iq_test)
test_iq_rf$total_cases<- tunedModel_iq_test
test_iq_rf <- select(test_iq_rf, -c(tunedModel_iq_test, month, day, day_of_year))
test_iq_rf$total_cases <- ceiling(test_iq_rf$total_cases)
head(test_iq_rf[,c(1:4,25)], n = 3)
##   city year weekofyear week_start_date total_cases
## 1   iq 2010         26      2010-07-02          10
## 2   iq 2010         27      2010-07-09           4
## 3   iq 2010         28      2010-07-16           9

Combining the data for both cities into a data frame

test_combineddf_rf = rbind(test_sj_rf,test_iq_rf)
head(test_combineddf_rf[,c(1:4, 25)], n = 3)
##   city year weekofyear week_start_date total_cases
## 1   sj 2008         18      2008-04-29          14
## 2   sj 2008         19      2008-05-06          17
## 3   sj 2008         20      2008-05-13          25
tail(test_combineddf_rf[,c(1:4, 25)], n = 3)
##      city year weekofyear week_start_date total_cases
## 1541   iq 2013         24      2013-06-11           5
## 1551   iq 2013         25      2013-06-18           4
## 1561   iq 2013         26      2013-06-25           5

Finally, submission based on the best model from Random Forest developed on entire train_df dataset

submission1 <- submission[ ,1:3] # submission based on the best model from Random Forest developed on entire train_df dataset
submission1$total_cases<- rf_df_testfinal
submission1$total_cases <- ceiling(submission1$total_cases)
head(submission1, n = 3)
##   city year weekofyear total_cases
## 1   sj 2008         18          11
## 2   sj 2008         19          12
## 3   sj 2008         20          21
tail(submission1, n = 3)
##     city year weekofyear total_cases
## 414   iq 2013         24           3
## 415   iq 2013         25           6
## 416   iq 2013         26           4

Submission based on the best model from SVM developed on entire train_df dataset

submission2<-submission[ ,1:3] # submission based on the best model from SVM developed on entire train_df dataset
submission2$total_cases <- tunedModel_df_testfinal
submission2$total_cases <- ceiling(submission2$total_cases)
head(submission2, n = 3)
##   city year weekofyear total_cases
## 1   sj 2008         18          33
## 2   sj 2008         19          10
## 3   sj 2008         20          -1
tail(submission2, n = 3)
##     city year weekofyear total_cases
## 414   iq 2013         24           4
## 415   iq 2013         25          17
## 416   iq 2013         26          12

Submission by merging best outcomes from lm_test_sj and lm_test_iq (Random Forest)

submission3<- submission[ ,1:3] # submission by merging best outcomes from lm_test_sj and lm_test_iq
submission3$total_cases<-test_combineddf_rf$total_cases
submission3$total_cases <- ceiling(submission3$total_cases)
head(submission3, n = 3)
##   city year weekofyear total_cases
## 1   sj 2008         18          14
## 2   sj 2008         19          17
## 3   sj 2008         20          25
tail(submission3, n = 3)
##     city year weekofyear total_cases
## 414   iq 2013         24           5
## 415   iq 2013         25           4
## 416   iq 2013         26           5

Save submissions

write.csv(submission1,"submission1.csv", row.names=FALSE)
write.csv(submission2,"submission2.csv", row.names=FALSE)
write.csv(submission3,"submission3.csv", row.names=FALSE)

Plot for total_cases predicted by Year using the best Random Forest prediction on entire Test set

test_df_rf$week_start_date <- as.Date(test_df_rf$week_start_date)

cases_test_df <- ggplot(data = test_df_rf, aes(x = week_start_date, y = total_cases, color = city)) +
  geom_line() +
  labs(x = "Year",
       y = "Total Cases",
       title = "Time Series of Total Cases Prediction in Test Set (Random Forest)",
       subtitle = "")
cases_test_df + scale_x_date(date_breaks = "1 year", date_labels = "%y") + scale_y_continuous(breaks=seq(0, 125, 5)) 

Plot for total_cases predicted by Year from merged cities using Random Forest

test_combineddf_rf$week_start_date <- as.Date(test_df_rf$week_start_date)

cases_merged_df <- ggplot(data = test_combineddf_rf, aes(x = week_start_date, y = total_cases, color = city)) +
  geom_line() +
  labs(x = "Year",
       y = "Total Cases",
       title = "Time Series of Total Cases Prediction From Merged Cities (Random Forest)",
       subtitle = "")
cases_merged_df + scale_x_date(date_breaks = "1 year", date_labels = "%y") + scale_y_continuous(breaks=seq(0, 125, 5)) 

Plot for total_cases predicted by Year using the best SVM prediction on entire Test set

test_df_svm$week_start_date <- as.Date(test_df_svm$week_start_date)

cases_svm_df <- ggplot(data = test_df_svm, aes(x = week_start_date, y = total_cases, color = city)) +
  geom_line() +
  labs(x = "Year",
       y = "Total Cases",
       title = "Time Series of Total Cases Prediction in Test Set (SVM)",
       subtitle = "")
cases_svm_df + scale_x_date(date_breaks = "1 year", date_labels = "%y") + scale_y_continuous(breaks=seq(0, 425, 15)) 

Plot for actual total_cases vs predicted total_cases (Random Forest)

plot_ly(x = ~lm_test_df$week_start_date) %>%
  add_lines(y = ~lm_test_df$total_cases, name = "Actual", line = list(color = "red")) %>%
  add_lines(y = ~rf_df_test3, name = "Predicted", line = list(color = "blue"), yaxis = "y2") %>%
  
  layout(
    yaxis = list(
      side = "left",
      title = list("predicted vs actual")
    ),
    yaxis2 = list(
      side = "left",
      overlaying = "y",
      anchor = "free"
    ),
    margin = list(pad = 30)
  )